{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from os import walk\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-09.40.09.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-10.06.22.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-10.02.56.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-09.51.57.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-09.43.17.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-10.00.17.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-09.34.03.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-09.57.53.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw/export2017.09.15-10.04.23.csv\n",
      "(614, 31)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Author Affiliations</th>\n",
       "      <th>Publication Title</th>\n",
       "      <th>Date Added To Xplore</th>\n",
       "      <th>Year</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Issue</th>\n",
       "      <th>Start Page</th>\n",
       "      <th>End Page</th>\n",
       "      <th>...</th>\n",
       "      <th>Article Citation Count</th>\n",
       "      <th>Patent Citation Count</th>\n",
       "      <th>Reference Count</th>\n",
       "      <th>Copyright Year</th>\n",
       "      <th>License</th>\n",
       "      <th>Online Date</th>\n",
       "      <th>Issue Date</th>\n",
       "      <th>Meeting Date</th>\n",
       "      <th>Publisher</th>\n",
       "      <th>Document Identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Multi-Scale Rotation-Invariant Convolutional N...</td>\n",
       "      <td>Q. Wang; Y. Zheng; g. yang; W. Jin; X. Chen; y...</td>\n",
       "      <td>School of Computer Science and Technology, Sha...</td>\n",
       "      <td>IEEE Journal of Biomedical and Health Informatics</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2017</td>\n",
       "      <td>PP</td>\n",
       "      <td>99.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20170321.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Early Access Articles</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adrenal lesions detection on low-contrast CT i...</td>\n",
       "      <td>L. Bi; J. Kim; T. Su; M. Fulham; D. Feng; G. Ning</td>\n",
       "      <td>School of Information Technologies, University...</td>\n",
       "      <td>2017 IEEE 14th International Symposium on Biom...</td>\n",
       "      <td>20170619.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>895</td>\n",
       "      <td>898</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18-21 April 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Conference Publications</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Anatomy-specific classification of medical ima...</td>\n",
       "      <td>H. R. Roth; C. T. Lee; H. C. Shin; A. Seff; L....</td>\n",
       "      <td>Imaging Biomarkers and Computer-Aided Diagnosi...</td>\n",
       "      <td>2015 IEEE 12th International Symposium on Biom...</td>\n",
       "      <td>20150723.0</td>\n",
       "      <td>2015</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>104</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16-19 April 2015</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Conference Publications</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Deep Convolutional Neural Network for Inverse ...</td>\n",
       "      <td>K. H. Jin; M. T. McCann; E. Froustey; M. Unser</td>\n",
       "      <td>Biomedical Imaging Group, &amp;#x00C9;cole Polytec...</td>\n",
       "      <td>IEEE Transactions on Image Processing</td>\n",
       "      <td>20170711.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>26</td>\n",
       "      <td>9.0</td>\n",
       "      <td>4509</td>\n",
       "      <td>4522</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20170615.0</td>\n",
       "      <td>Sept. 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Convolutional Neural Networks for Medical Imag...</td>\n",
       "      <td>N. Tajbakhsh; J. Y. Shin; S. R. Gurudu; R. T. ...</td>\n",
       "      <td>Department of Biomedical Informatics, Arizona ...</td>\n",
       "      <td>IEEE Transactions on Medical Imaging</td>\n",
       "      <td>20160429.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>35</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1299</td>\n",
       "      <td>1312</td>\n",
       "      <td>...</td>\n",
       "      <td>34.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>76.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20160307.0</td>\n",
       "      <td>May 2016</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      Document Title  \\\n",
       "0  Multi-Scale Rotation-Invariant Convolutional N...   \n",
       "1  Adrenal lesions detection on low-contrast CT i...   \n",
       "2  Anatomy-specific classification of medical ima...   \n",
       "3  Deep Convolutional Neural Network for Inverse ...   \n",
       "4  Convolutional Neural Networks for Medical Imag...   \n",
       "\n",
       "                                             Authors  \\\n",
       "0  Q. Wang; Y. Zheng; g. yang; W. Jin; X. Chen; y...   \n",
       "1  L. Bi; J. Kim; T. Su; M. Fulham; D. Feng; G. Ning   \n",
       "2  H. R. Roth; C. T. Lee; H. C. Shin; A. Seff; L....   \n",
       "3     K. H. Jin; M. T. McCann; E. Froustey; M. Unser   \n",
       "4  N. Tajbakhsh; J. Y. Shin; S. R. Gurudu; R. T. ...   \n",
       "\n",
       "                                 Author Affiliations  \\\n",
       "0  School of Computer Science and Technology, Sha...   \n",
       "1  School of Information Technologies, University...   \n",
       "2  Imaging Biomarkers and Computer-Aided Diagnosi...   \n",
       "3  Biomedical Imaging Group, &#x00C9;cole Polytec...   \n",
       "4  Department of Biomedical Informatics, Arizona ...   \n",
       "\n",
       "                                   Publication Title  Date Added To Xplore  \\\n",
       "0  IEEE Journal of Biomedical and Health Informatics                   NaN   \n",
       "1  2017 IEEE 14th International Symposium on Biom...            20170619.0   \n",
       "2  2015 IEEE 12th International Symposium on Biom...            20150723.0   \n",
       "3              IEEE Transactions on Image Processing            20170711.0   \n",
       "4               IEEE Transactions on Medical Imaging            20160429.0   \n",
       "\n",
       "   Year Volume  Issue Start Page End Page              ...               \\\n",
       "0  2017     PP   99.0          1        1              ...                \n",
       "1  2017    NaN    NaN        895      898              ...                \n",
       "2  2015    NaN    NaN        101      104              ...                \n",
       "3  2017     26    9.0       4509     4522              ...                \n",
       "4  2016     35    5.0       1299     1312              ...                \n",
       "\n",
       "  Article Citation Count Patent Citation Count Reference Count Copyright Year  \\\n",
       "0                    NaN                   NaN             NaN            NaN   \n",
       "1                    NaN                   NaN             NaN            NaN   \n",
       "2                    5.0                   NaN            16.0            NaN   \n",
       "3                    NaN                   NaN             NaN            NaN   \n",
       "4                   34.0                   NaN            76.0            NaN   \n",
       "\n",
       "  License Online Date        Issue Date Meeting Date Publisher  \\\n",
       "0     NaN  20170321.0               NaN          NaN      IEEE   \n",
       "1     NaN         NaN  18-21 April 2017          NaN      IEEE   \n",
       "2     NaN         NaN  16-19 April 2015          NaN      IEEE   \n",
       "3     NaN  20170615.0        Sept. 2017          NaN      IEEE   \n",
       "4     NaN  20160307.0          May 2016          NaN      IEEE   \n",
       "\n",
       "            Document Identifier  \n",
       "0    IEEE Early Access Articles  \n",
       "1  IEEE Conference Publications  \n",
       "2  IEEE Conference Publications  \n",
       "3     IEEE Journals & Magazines  \n",
       "4     IEEE Journals & Magazines  \n",
       "\n",
       "[5 rows x 31 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = '/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw1'\n",
    "dfs = []\n",
    "for (_, _, filenames) in walk(path):\n",
    "    for filename in filenames:\n",
    "        fullPath = path+'/'+filename\n",
    "        print fullPath\n",
    "        dfs.append( pd.read_csv(fullPath,header=1) )\n",
    "df = pd.concat(dfs)\n",
    "# reset index\n",
    "df = df.reset_index(drop=True)\n",
    "\n",
    "# remove duplicates by title\n",
    "df = df.drop_duplicates(subset='DOI', keep=\"first\")\n",
    "df = df[df['Year']>=2014]\n",
    "# reset index\n",
    "df = df.reset_index(drop=True)\n",
    "print df.shape\n",
    "df.head()\n",
    "\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.to_csv( 'dirty_combined_1.csv' )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# manually clean dirty_combined_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# do second round"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.21.02.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.24.36.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.18.09.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.30.43.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.17.07.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.27.43.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.25.14.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.14.06.csv\n",
      "/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2/export2017.09.18-11.14.46.csv\n",
      "(193, 31)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Author Affiliations</th>\n",
       "      <th>Publication Title</th>\n",
       "      <th>Date Added To Xplore</th>\n",
       "      <th>Year</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Issue</th>\n",
       "      <th>Start Page</th>\n",
       "      <th>End Page</th>\n",
       "      <th>...</th>\n",
       "      <th>Article Citation Count</th>\n",
       "      <th>Patent Citation Count</th>\n",
       "      <th>Reference Count</th>\n",
       "      <th>Copyright Year</th>\n",
       "      <th>License</th>\n",
       "      <th>Online Date</th>\n",
       "      <th>Issue Date</th>\n",
       "      <th>Meeting Date</th>\n",
       "      <th>Publisher</th>\n",
       "      <th>Document Identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Integrating Online and Offline Three-Dimension...</td>\n",
       "      <td>L. Yu; H. Chen; Q. Dou; J. Qin; P. A. Heng</td>\n",
       "      <td>Department of Computer Science and Engineering...</td>\n",
       "      <td>IEEE Journal of Biomedical and Health Informatics</td>\n",
       "      <td>20170520.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>21</td>\n",
       "      <td>1.0</td>\n",
       "      <td>65</td>\n",
       "      <td>75</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20161207.0</td>\n",
       "      <td>Jan. 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Automatic Detection and Classification of Colo...</td>\n",
       "      <td>R. Zhang; Y. Zheng; T. W. C. Mak; R. Yu; S. H....</td>\n",
       "      <td>Department of Surgery, The Chinese University ...</td>\n",
       "      <td>IEEE Journal of Biomedical and Health Informatics</td>\n",
       "      <td>20170520.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>21</td>\n",
       "      <td>1.0</td>\n",
       "      <td>41</td>\n",
       "      <td>47</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20161205.0</td>\n",
       "      <td>Jan. 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Comparison of hand-craft feature based SVM and...</td>\n",
       "      <td>Y. Shin; I. Balasingham</td>\n",
       "      <td>Department Electronics and Telecommunications ...</td>\n",
       "      <td>2017 39th Annual International Conference of t...</td>\n",
       "      <td>20170914.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3277</td>\n",
       "      <td>3280</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11-15 July 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Conference Publications</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Combining Convolutional and Recurrent Neural N...</td>\n",
       "      <td>H. Zuo; H. Fan; E. Blasch; H. Ling</td>\n",
       "      <td>Department of Chemical Equipment and Control E...</td>\n",
       "      <td>IEEE Signal Processing Letters</td>\n",
       "      <td>20170209.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>24</td>\n",
       "      <td>3.0</td>\n",
       "      <td>289</td>\n",
       "      <td>293</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20170117.0</td>\n",
       "      <td>March 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Deep Learning Segmentation of Optical Microsco...</td>\n",
       "      <td>R. Li; T. Zeng; H. Peng; S. Ji</td>\n",
       "      <td>School of Electrical Engineering and Computer ...</td>\n",
       "      <td>IEEE Transactions on Medical Imaging</td>\n",
       "      <td>20170628.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>36</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1533</td>\n",
       "      <td>1541</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20170308.0</td>\n",
       "      <td>July 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      Document Title  \\\n",
       "0  Integrating Online and Offline Three-Dimension...   \n",
       "1  Automatic Detection and Classification of Colo...   \n",
       "2  Comparison of hand-craft feature based SVM and...   \n",
       "3  Combining Convolutional and Recurrent Neural N...   \n",
       "4  Deep Learning Segmentation of Optical Microsco...   \n",
       "\n",
       "                                             Authors  \\\n",
       "0         L. Yu; H. Chen; Q. Dou; J. Qin; P. A. Heng   \n",
       "1  R. Zhang; Y. Zheng; T. W. C. Mak; R. Yu; S. H....   \n",
       "2                            Y. Shin; I. Balasingham   \n",
       "3                 H. Zuo; H. Fan; E. Blasch; H. Ling   \n",
       "4                     R. Li; T. Zeng; H. Peng; S. Ji   \n",
       "\n",
       "                                 Author Affiliations  \\\n",
       "0  Department of Computer Science and Engineering...   \n",
       "1  Department of Surgery, The Chinese University ...   \n",
       "2  Department Electronics and Telecommunications ...   \n",
       "3  Department of Chemical Equipment and Control E...   \n",
       "4  School of Electrical Engineering and Computer ...   \n",
       "\n",
       "                                   Publication Title  Date Added To Xplore  \\\n",
       "0  IEEE Journal of Biomedical and Health Informatics            20170520.0   \n",
       "1  IEEE Journal of Biomedical and Health Informatics            20170520.0   \n",
       "2  2017 39th Annual International Conference of t...            20170914.0   \n",
       "3                     IEEE Signal Processing Letters            20170209.0   \n",
       "4               IEEE Transactions on Medical Imaging            20170628.0   \n",
       "\n",
       "   Year Volume  Issue Start Page End Page              ...               \\\n",
       "0  2017     21    1.0         65       75              ...                \n",
       "1  2017     21    1.0         41       47              ...                \n",
       "2  2017    NaN    NaN       3277     3280              ...                \n",
       "3  2017     24    3.0        289      293              ...                \n",
       "4  2017     36    7.0       1533     1541              ...                \n",
       "\n",
       "  Article Citation Count Patent Citation Count Reference Count Copyright Year  \\\n",
       "0                    NaN                   NaN             NaN            NaN   \n",
       "1                    NaN                   NaN             NaN            NaN   \n",
       "2                    NaN                   NaN             NaN            NaN   \n",
       "3                    NaN                   NaN             NaN            NaN   \n",
       "4                    NaN                   NaN             NaN            NaN   \n",
       "\n",
       "  License Online Date       Issue Date Meeting Date Publisher  \\\n",
       "0     NaN  20161207.0        Jan. 2017          NaN      IEEE   \n",
       "1     NaN  20161205.0        Jan. 2017          NaN      IEEE   \n",
       "2     NaN         NaN  11-15 July 2017          NaN      IEEE   \n",
       "3     NaN  20170117.0       March 2017          NaN      IEEE   \n",
       "4     NaN  20170308.0        July 2017          NaN      IEEE   \n",
       "\n",
       "            Document Identifier  \n",
       "0     IEEE Journals & Magazines  \n",
       "1     IEEE Journals & Magazines  \n",
       "2  IEEE Conference Publications  \n",
       "3     IEEE Journals & Magazines  \n",
       "4     IEEE Journals & Magazines  \n",
       "\n",
       "[5 rows x 31 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = '/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/raw2'\n",
    "dfs = []\n",
    "for (_, _, filenames) in walk(path):\n",
    "    for filename in filenames:\n",
    "        fullPath = path+'/'+filename\n",
    "        print fullPath\n",
    "        dfs.append( pd.read_csv(fullPath,header=1) )\n",
    "df = pd.concat(dfs)\n",
    "# reset index\n",
    "df = df.reset_index(drop=True)\n",
    "\n",
    "# remove duplicates by title\n",
    "df = df.drop_duplicates(subset='DOI', keep=\"first\")\n",
    "df = df[df['Year']>=2014]\n",
    "# reset index\n",
    "df = df.reset_index(drop=True)\n",
    "print df.shape\n",
    "df.head()\n",
    "\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.to_csv( 'dirty_combined_2.csv' )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# manually clean dirty_combined_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# combine both cleans into 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(490, 31)\n",
      "(154, 31)\n",
      "(644, 31)\n",
      "(564, 31)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Author Affiliations</th>\n",
       "      <th>Publication Title</th>\n",
       "      <th>Date Added To Xplore</th>\n",
       "      <th>Year</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Issue</th>\n",
       "      <th>Start Page</th>\n",
       "      <th>End Page</th>\n",
       "      <th>...</th>\n",
       "      <th>Article Citation Count</th>\n",
       "      <th>Patent Citation Count</th>\n",
       "      <th>Reference Count</th>\n",
       "      <th>Copyright Year</th>\n",
       "      <th>License</th>\n",
       "      <th>Online Date</th>\n",
       "      <th>Issue Date</th>\n",
       "      <th>Meeting Date</th>\n",
       "      <th>Publisher</th>\n",
       "      <th>Document Identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Multi-Scale Rotation-Invariant Convolutional N...</td>\n",
       "      <td>Q. Wang; Y. Zheng; g. yang; W. Jin; X. Chen; y...</td>\n",
       "      <td>School of Computer Science and Technology, Sha...</td>\n",
       "      <td>IEEE Journal of Biomedical and Health Informatics</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2017</td>\n",
       "      <td>PP</td>\n",
       "      <td>99.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20170321.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Early Access Articles</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adrenal lesions detection on low-contrast CT i...</td>\n",
       "      <td>L. Bi; J. Kim; T. Su; M. Fulham; D. Feng; G. Ning</td>\n",
       "      <td>School of Information Technologies, University...</td>\n",
       "      <td>2017 IEEE 14th International Symposium on Biom...</td>\n",
       "      <td>20170619.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>895</td>\n",
       "      <td>898</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18-21 April 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Conference Publications</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Anatomy-specific classification of medical ima...</td>\n",
       "      <td>H. R. Roth; C. T. Lee; H. C. Shin; A. Seff; L....</td>\n",
       "      <td>Imaging Biomarkers and Computer-Aided Diagnosi...</td>\n",
       "      <td>2015 IEEE 12th International Symposium on Biom...</td>\n",
       "      <td>20150723.0</td>\n",
       "      <td>2015</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>104</td>\n",
       "      <td>...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16-19 April 2015</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Conference Publications</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Deep Convolutional Neural Network for Inverse ...</td>\n",
       "      <td>K. H. Jin; M. T. McCann; E. Froustey; M. Unser</td>\n",
       "      <td>Biomedical Imaging Group, &amp;#x00C9;cole Polytec...</td>\n",
       "      <td>IEEE Transactions on Image Processing</td>\n",
       "      <td>20170711.0</td>\n",
       "      <td>2017</td>\n",
       "      <td>26</td>\n",
       "      <td>9.0</td>\n",
       "      <td>4509</td>\n",
       "      <td>4522</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20170615.0</td>\n",
       "      <td>Sept. 2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Convolutional Neural Networks for Medical Imag...</td>\n",
       "      <td>N. Tajbakhsh; J. Y. Shin; S. R. Gurudu; R. T. ...</td>\n",
       "      <td>Department of Biomedical Informatics, Arizona ...</td>\n",
       "      <td>IEEE Transactions on Medical Imaging</td>\n",
       "      <td>20160429.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>35</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1299</td>\n",
       "      <td>1312</td>\n",
       "      <td>...</td>\n",
       "      <td>34.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>76.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20160307.0</td>\n",
       "      <td>May 2016</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IEEE</td>\n",
       "      <td>IEEE Journals &amp; Magazines</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      Document Title  \\\n",
       "0  Multi-Scale Rotation-Invariant Convolutional N...   \n",
       "1  Adrenal lesions detection on low-contrast CT i...   \n",
       "2  Anatomy-specific classification of medical ima...   \n",
       "3  Deep Convolutional Neural Network for Inverse ...   \n",
       "4  Convolutional Neural Networks for Medical Imag...   \n",
       "\n",
       "                                             Authors  \\\n",
       "0  Q. Wang; Y. Zheng; g. yang; W. Jin; X. Chen; y...   \n",
       "1  L. Bi; J. Kim; T. Su; M. Fulham; D. Feng; G. Ning   \n",
       "2  H. R. Roth; C. T. Lee; H. C. Shin; A. Seff; L....   \n",
       "3     K. H. Jin; M. T. McCann; E. Froustey; M. Unser   \n",
       "4  N. Tajbakhsh; J. Y. Shin; S. R. Gurudu; R. T. ...   \n",
       "\n",
       "                                 Author Affiliations  \\\n",
       "0  School of Computer Science and Technology, Sha...   \n",
       "1  School of Information Technologies, University...   \n",
       "2  Imaging Biomarkers and Computer-Aided Diagnosi...   \n",
       "3  Biomedical Imaging Group, &#x00C9;cole Polytec...   \n",
       "4  Department of Biomedical Informatics, Arizona ...   \n",
       "\n",
       "                                   Publication Title  Date Added To Xplore  \\\n",
       "0  IEEE Journal of Biomedical and Health Informatics                   NaN   \n",
       "1  2017 IEEE 14th International Symposium on Biom...            20170619.0   \n",
       "2  2015 IEEE 12th International Symposium on Biom...            20150723.0   \n",
       "3              IEEE Transactions on Image Processing            20170711.0   \n",
       "4               IEEE Transactions on Medical Imaging            20160429.0   \n",
       "\n",
       "   Year Volume  Issue  Start Page  End Page              ...               \\\n",
       "0  2017     PP   99.0           1         1              ...                \n",
       "1  2017    NaN    NaN         895       898              ...                \n",
       "2  2015    NaN    NaN         101       104              ...                \n",
       "3  2017     26    9.0        4509      4522              ...                \n",
       "4  2016     35    5.0        1299      1312              ...                \n",
       "\n",
       "  Article Citation Count Patent Citation Count Reference Count Copyright Year  \\\n",
       "0                    NaN                   NaN             NaN            NaN   \n",
       "1                    NaN                   NaN             NaN            NaN   \n",
       "2                    5.0                   NaN            16.0            NaN   \n",
       "3                    NaN                   NaN             NaN            NaN   \n",
       "4                   34.0                   NaN            76.0            NaN   \n",
       "\n",
       "  License Online Date        Issue Date Meeting Date Publisher  \\\n",
       "0     NaN  20170321.0               NaN          NaN      IEEE   \n",
       "1     NaN         NaN  18-21 April 2017          NaN      IEEE   \n",
       "2     NaN         NaN  16-19 April 2015          NaN      IEEE   \n",
       "3     NaN  20170615.0        Sept. 2017          NaN      IEEE   \n",
       "4     NaN  20160307.0          May 2016          NaN      IEEE   \n",
       "\n",
       "            Document Identifier  \n",
       "0    IEEE Early Access Articles  \n",
       "1  IEEE Conference Publications  \n",
       "2  IEEE Conference Publications  \n",
       "3     IEEE Journals & Magazines  \n",
       "4     IEEE Journals & Magazines  \n",
       "\n",
       "[5 rows x 31 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the two cleaned IEEE exports. pd.read_csv replaces the deprecated\n",
    "# DataFrame.from_csv, which also needed a `df` left over from an earlier cell.\n",
    "df1 = pd.read_csv('/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/clean_combined_1.csv', index_col=None)\n",
    "print(df1.shape)\n",
    "df2 = pd.read_csv('/home/ahmed/Dropbox/DFCI/08_radiomics.io/science/ieee/clean_combined_2.csv', index_col=None)\n",
    "print(df2.shape)\n",
    "# stack both exports into a single frame\n",
    "df = pd.concat([df1, df2])\n",
    "print(df.shape)\n",
    "# remove duplicates by DOI, keeping the first occurrence\n",
    "# NOTE(review): rows with a missing DOI presumably compare equal and collapse to one row - verify\n",
    "df = df.drop_duplicates(subset='DOI', keep='first')\n",
    "# reset index so rows are numbered 0..n-1 after concatenation and de-duplication\n",
    "df = df.reset_index(drop=True)\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the combined frame to a CSV file in the current working directory.\n",
    "output_csv = 'ieee.csv'\n",
    "df.to_csv(output_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venvradiomicsio",
   "language": "python",
   "name": "venvradiomicsio"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
